'''
Created on Jun 28, 2010

@author: oabalbin
'''
import sys
import copy
import pickle
import numpy as np
from optparse import OptionParser
from datetime import datetime
from collections import defaultdict, deque

import amarillo.copa.copa_analysis as ca
import amarillo.copa.subset_genes as sg
import amarillo.parser.parse_gene_lists as pg
import RNAseq.array as seqarray

if __name__ == '__main__':
    # Report every gene named in the known gene lists that does not appear
    # in any of the Panther gene lists.  The missing genes are written to a
    # text report (one per line) and also echoed to stdout with their count.

    panther_folder = '/data/metadata/panther_gene_list/'
    known_folder = '/data/metadata/my_gene_lists/'
    report_path = ('/data/metadata/panther_gene_list/'
                   'pantherGeneList_known_notfound.txt')

    # Get all the '.txt' gene list files in each folder.
    # NOTE(review): the report is written as a '.txt' file into the Panther
    # folder itself, so a re-run may pick it up as a Panther gene list --
    # confirm how sg.read_files_folder selects files.
    panther_files = sg.read_files_folder(panther_folder, '.txt')
    known_files = sg.read_files_folder(known_folder, '.txt')

    # Flatten the genes of every Panther file into one list.  The parser
    # result is consumed inside the ``with`` so the file is still open while
    # it is being read, and is closed right afterwards.
    panther_genes = []
    for gene_list_file in panther_files:
        with open(gene_list_file) as handle:
            panther_genes.extend(pg.panther_gene_list(handle))

    # Same for the known gene lists, which use a different parser.
    known_genes = []
    for gene_list_file in known_files:
        with open(gene_list_file) as handle:
            known_genes.extend(ca.list_of_names(handle))

    # Direction matters: known genes that are NOT present in Panther.
    gene_difference = set(known_genes).difference(panther_genes)

    # The context manager guarantees the report is flushed and closed.
    with open(report_path, 'w') as outfile:
        for gene in gene_difference:
            outfile.write('not found in panther ;' + gene + '\n')

    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and is also valid Python 3.
    print(gene_difference)
    print(len(gene_difference))

